suppressMessages(
suppressWarnings(
c(library(DESeq2),
library(data.table),
library(gdata),
library(rtracklayer),
library(BSgenome),
library(VennDiagram),
library(ggplot2),
library(biomaRt),
library(RColorBrewer),
library(readr),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(org.Mm.eg.db),
library(AnnotationDbi),
library(tidyverse),
library(plotly),
library(knitr)
)))
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "matrixStats" "Biobase" "GenomicRanges"
## [7] "GenomeInfoDb" "IRanges" "S4Vectors"
## [10] "BiocGenerics" "parallel" "stats4"
## [13] "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods"
## [19] "base" "data.table" "DESeq2"
## [22] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [25] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [28] "IRanges" "S4Vectors" "BiocGenerics"
## [31] "parallel" "stats4" "stats"
## [34] "graphics" "grDevices" "utils"
## [37] "datasets" "methods" "base"
## [40] "gdata" "data.table" "DESeq2"
## [43] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [46] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [49] "IRanges" "S4Vectors" "BiocGenerics"
## [52] "parallel" "stats4" "stats"
## [55] "graphics" "grDevices" "utils"
## [58] "datasets" "methods" "base"
## [61] "rtracklayer" "gdata" "data.table"
## [64] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [67] "matrixStats" "Biobase" "GenomicRanges"
## [70] "GenomeInfoDb" "IRanges" "S4Vectors"
## [73] "BiocGenerics" "parallel" "stats4"
## [76] "stats" "graphics" "grDevices"
## [79] "utils" "datasets" "methods"
## [82] "base" "BSgenome" "Biostrings"
## [85] "XVector" "rtracklayer" "gdata"
## [88] "data.table" "DESeq2" "SummarizedExperiment"
## [91] "DelayedArray" "matrixStats" "Biobase"
## [94] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [97] "S4Vectors" "BiocGenerics" "parallel"
## [100] "stats4" "stats" "graphics"
## [103] "grDevices" "utils" "datasets"
## [106] "methods" "base" "VennDiagram"
## [109] "futile.logger" "grid" "BSgenome"
## [112] "Biostrings" "XVector" "rtracklayer"
## [115] "gdata" "data.table" "DESeq2"
## [118] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [121] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [124] "IRanges" "S4Vectors" "BiocGenerics"
## [127] "parallel" "stats4" "stats"
## [130] "graphics" "grDevices" "utils"
## [133] "datasets" "methods" "base"
## [136] "ggplot2" "VennDiagram" "futile.logger"
## [139] "grid" "BSgenome" "Biostrings"
## [142] "XVector" "rtracklayer" "gdata"
## [145] "data.table" "DESeq2" "SummarizedExperiment"
## [148] "DelayedArray" "matrixStats" "Biobase"
## [151] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [154] "S4Vectors" "BiocGenerics" "parallel"
## [157] "stats4" "stats" "graphics"
## [160] "grDevices" "utils" "datasets"
## [163] "methods" "base" "biomaRt"
## [166] "ggplot2" "VennDiagram" "futile.logger"
## [169] "grid" "BSgenome" "Biostrings"
## [172] "XVector" "rtracklayer" "gdata"
## [175] "data.table" "DESeq2" "SummarizedExperiment"
## [178] "DelayedArray" "matrixStats" "Biobase"
## [181] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [184] "S4Vectors" "BiocGenerics" "parallel"
## [187] "stats4" "stats" "graphics"
## [190] "grDevices" "utils" "datasets"
## [193] "methods" "base" "RColorBrewer"
## [196] "biomaRt" "ggplot2" "VennDiagram"
## [199] "futile.logger" "grid" "BSgenome"
## [202] "Biostrings" "XVector" "rtracklayer"
## [205] "gdata" "data.table" "DESeq2"
## [208] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [211] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [214] "IRanges" "S4Vectors" "BiocGenerics"
## [217] "parallel" "stats4" "stats"
## [220] "graphics" "grDevices" "utils"
## [223] "datasets" "methods" "base"
## [226] "readr" "RColorBrewer" "biomaRt"
## [229] "ggplot2" "VennDiagram" "futile.logger"
## [232] "grid" "BSgenome" "Biostrings"
## [235] "XVector" "rtracklayer" "gdata"
## [238] "data.table" "DESeq2" "SummarizedExperiment"
## [241] "DelayedArray" "matrixStats" "Biobase"
## [244] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [247] "S4Vectors" "BiocGenerics" "parallel"
## [250] "stats4" "stats" "graphics"
## [253] "grDevices" "utils" "datasets"
## [256] "methods" "base" "ensembldb"
## [259] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [262] "readr" "RColorBrewer" "biomaRt"
## [265] "ggplot2" "VennDiagram" "futile.logger"
## [268] "grid" "BSgenome" "Biostrings"
## [271] "XVector" "rtracklayer" "gdata"
## [274] "data.table" "DESeq2" "SummarizedExperiment"
## [277] "DelayedArray" "matrixStats" "Biobase"
## [280] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [283] "S4Vectors" "BiocGenerics" "parallel"
## [286] "stats4" "stats" "graphics"
## [289] "grDevices" "utils" "datasets"
## [292] "methods" "base" "EnsDb.Mmusculus.v79"
## [295] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [298] "AnnotationDbi" "readr" "RColorBrewer"
## [301] "biomaRt" "ggplot2" "VennDiagram"
## [304] "futile.logger" "grid" "BSgenome"
## [307] "Biostrings" "XVector" "rtracklayer"
## [310] "gdata" "data.table" "DESeq2"
## [313] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [316] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [319] "IRanges" "S4Vectors" "BiocGenerics"
## [322] "parallel" "stats4" "stats"
## [325] "graphics" "grDevices" "utils"
## [328] "datasets" "methods" "base"
## [331] "org.Mm.eg.db" "EnsDb.Mmusculus.v79" "ensembldb"
## [334] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [337] "readr" "RColorBrewer" "biomaRt"
## [340] "ggplot2" "VennDiagram" "futile.logger"
## [343] "grid" "BSgenome" "Biostrings"
## [346] "XVector" "rtracklayer" "gdata"
## [349] "data.table" "DESeq2" "SummarizedExperiment"
## [352] "DelayedArray" "matrixStats" "Biobase"
## [355] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [358] "S4Vectors" "BiocGenerics" "parallel"
## [361] "stats4" "stats" "graphics"
## [364] "grDevices" "utils" "datasets"
## [367] "methods" "base" "org.Mm.eg.db"
## [370] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [373] "GenomicFeatures" "AnnotationDbi" "readr"
## [376] "RColorBrewer" "biomaRt" "ggplot2"
## [379] "VennDiagram" "futile.logger" "grid"
## [382] "BSgenome" "Biostrings" "XVector"
## [385] "rtracklayer" "gdata" "data.table"
## [388] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [391] "matrixStats" "Biobase" "GenomicRanges"
## [394] "GenomeInfoDb" "IRanges" "S4Vectors"
## [397] "BiocGenerics" "parallel" "stats4"
## [400] "stats" "graphics" "grDevices"
## [403] "utils" "datasets" "methods"
## [406] "base" "forcats" "stringr"
## [409] "dplyr" "purrr" "tidyr"
## [412] "tibble" "tidyverse" "org.Mm.eg.db"
## [415] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [418] "GenomicFeatures" "AnnotationDbi" "readr"
## [421] "RColorBrewer" "biomaRt" "ggplot2"
## [424] "VennDiagram" "futile.logger" "grid"
## [427] "BSgenome" "Biostrings" "XVector"
## [430] "rtracklayer" "gdata" "data.table"
## [433] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [436] "matrixStats" "Biobase" "GenomicRanges"
## [439] "GenomeInfoDb" "IRanges" "S4Vectors"
## [442] "BiocGenerics" "parallel" "stats4"
## [445] "stats" "graphics" "grDevices"
## [448] "utils" "datasets" "methods"
## [451] "base" "plotly" "forcats"
## [454] "stringr" "dplyr" "purrr"
## [457] "tidyr" "tibble" "tidyverse"
## [460] "org.Mm.eg.db" "EnsDb.Mmusculus.v79" "ensembldb"
## [463] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [466] "readr" "RColorBrewer" "biomaRt"
## [469] "ggplot2" "VennDiagram" "futile.logger"
## [472] "grid" "BSgenome" "Biostrings"
## [475] "XVector" "rtracklayer" "gdata"
## [478] "data.table" "DESeq2" "SummarizedExperiment"
## [481] "DelayedArray" "matrixStats" "Biobase"
## [484] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [487] "S4Vectors" "BiocGenerics" "parallel"
## [490] "stats4" "stats" "graphics"
## [493] "grDevices" "utils" "datasets"
## [496] "methods" "base" "knitr"
## [499] "plotly" "forcats" "stringr"
## [502] "dplyr" "purrr" "tidyr"
## [505] "tibble" "tidyverse" "org.Mm.eg.db"
## [508] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [511] "GenomicFeatures" "AnnotationDbi" "readr"
## [514] "RColorBrewer" "biomaRt" "ggplot2"
## [517] "VennDiagram" "futile.logger" "grid"
## [520] "BSgenome" "Biostrings" "XVector"
## [523] "rtracklayer" "gdata" "data.table"
## [526] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [529] "matrixStats" "Biobase" "GenomicRanges"
## [532] "GenomeInfoDb" "IRanges" "S4Vectors"
## [535] "BiocGenerics" "parallel" "stats4"
## [538] "stats" "graphics" "grDevices"
## [541] "utils" "datasets" "methods"
## [544] "base"
Set up the function
plotCDF.ggplot <- function(gene.counts, gene.sets, gene.set.labels,
col = "", linetype = "", xlim = c( -1.0, 1.3 ),
legend.size = 22, axistitle.size = 22, title = "Fold change log2 (Dicer KO/WT)",
legend.pos = c(0.7, 0.18)) {
require(ggplot2)
df.log2fc <- gene.counts[,c("gene", "log2FC")]
#rownames(df.log2fc) <- df.log2fc$gene
if (length(gene.sets) != length(gene.set.labels)){
return("Length of gene sets doesn't match labels")
}
target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
for (i in 2:length(gene.sets)){
target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
}
gene.set.counts <- c()
for (j in 1:length(gene.sets)){
gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
}
target.expr$Category <- rep(gene.set.labels, gene.set.counts)
target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
log2FC.values <- lapply(gene.sets, function(gene.set) {
gene.counts[gene.counts$gene %in% gene.set,]$log2FC
})
ks.pvals <- lapply(log2FC.values,
function(log2FCs) {
ks.test(log2FCs, log2FC.values[[1]])$p.value
})
p <- ggplot( target.expr, aes( x = log2FC, colour = Category ) ) +
stat_ecdf( geom = 'step', aes( colour = Category, linetype = Category ), lwd = 1 ) +
scale_color_manual( values = col, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
scale_linetype_manual( values = linetype, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
# xlim() will remove data points; Be careful in the future
coord_cartesian( xlim = xlim ) + xlab(title) + ylab('CDF') +
theme_classic() + theme( legend.background = element_rect(fill = NA),
legend.title = element_blank(),
legend.position = legend.pos,
legend.text = element_text(size=legend.size),
legend.key.size = unit(1.5, 'lines'),
axis.title.x = element_text(size=axistitle.size, margin = margin(t = 10)),
axis.title.y = element_text(size=axistitle.size, margin = margin(r = 10)),
axis.text=element_text(size=20),
axis.line = element_line(size = 1), #axis label size
axis.ticks.length = unit(0.3, "cm")) #increase tick length
for (k in 2:length(gene.sets)){
p <- p + annotate(geom = "text", x = -0.9, y = 1-0.08*(k-1), hjust = 0,
label = sprintf("p = %.0e", ks.pvals[k]),
colour = col[k], size = 8)
}
print(p)
}
Load the enema model RNA-Seq DGE dataset.
rna_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon tumor/4-OHT enema model/Analysis/Differential Analysis_filtered.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## baseMean = col_double(),
## log2FoldChange = col_double(),
## lfcSE = col_double(),
## stat = col_double(),
## pvalue = col_double(),
## padj = col_double()
## )
Now load the list of peaks associated with each miRNA.
peaks.mir <- readRDS("Datafiles/peaks-mirs-200-09282019.rds")
# First we need to nnotate each peak with its potential target gene, 3'UTR annotation gets priority.
peaks.mir$'target_gene' <- NA
for (i in 1:length(peaks.mir)) {
if (!is.na(peaks.mir$utr3[i]) | !is.na(peaks.mir$`utr3*`[i])) {
gene_name <- unique(c(peaks.mir$utr3[i],peaks.mir$`utr3*`[i]))
gene_name <- gene_name[!is.na(gene_name)]
peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
}
else {
gene_name <- unique(c(peaks.mir$exon[i], peaks.mir$intron[i],peaks.mir$utr5[i],peaks.mir$`utr5*`[i]))
gene_name <- gene_name[!is.na(gene_name)]
if (length(gene_name) >0) {
peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
}
}
}
Now I need to annotate the target list with both Ensembl ID and Uniprot ID and Enterez ID (for KEGG and GSEA).
# annotate with Ensembl ID
# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
target_gene_list <- peaks.mir$target_gene[!is.na(peaks.mir$target_gene)]
annotations_ensembl <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
keys = as.character(target_gene_list),
columns = c("GENEID"),
keytype = "GENENAME")
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_ensembl$GENEID) == FALSE)
# Return only the non-duplicated genes using indices
annotations_ensembl <- annotations_ensembl[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_ensembl$GENENAME) %>%
which() %>%
length()
## [1] 0
# annotate the dataset with Ensembl ID
peaks.mir$target_Ensembl_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], annotations_ensembl$GENENAME)
if (length(index)>0) {
peaks.mir$target_Ensembl_ID[i] <- paste(annotations_ensembl$GENEID[index], collapse = " ")
}
}
# annotate the dataset with Uniprot ID
## first I need to generate a list of genes to retrieve Uniprot ID from the website
peak.targets <- unique(peaks.mir$target_gene)
write.csv(peak.targets, "all_peaks.csv")
uniprot_ID <- read_csv("all_peaks_Uniprot.csv")
## Parsed with column specification:
## cols(
## gene_name = col_character(),
## Entry = col_character(),
## `Entry name` = col_character(),
## Status = col_character(),
## `Protein names` = col_character(),
## `Gene names` = col_character(),
## Organism = col_character(),
## Length = col_double()
## )
peaks.mir$target_Uniprot_ID <- NA
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], uniprot_ID$gene_name)
if (length(index)>0) {
peaks.mir$target_Uniprot_ID[i] <- paste(uniprot_ID$Entry[index], collapse = " ")
}
}
## filter out targets with no ID
no_id <- setdiff(peak.targets, uniprot_ID$gene_name)
write.csv(no_id, "no_UniprotID.csv")
## load the manually annotated ID list
no.id.2.id <- read_csv("no_UniprotID_2_ID.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
for (i in 1:dim(no.id.2.id)[1]) {
index <- grep(no.id.2.id$X1[i], peaks.mir$target_gene)
if (length(index)>0) {
peaks.mir$target_Uniprot_ID[index] <- paste(no.id.2.id$X2[i], collapse = " ")
}
}
# number of genes with UniprotID
length(unique(peaks.mir$target_Uniprot_ID[!is.na(peaks.mir$target_Uniprot_ID)]))
## [1] 3051
# annotate with Entrez ID
# Entrez IDs are annotated using `org.Mm.eg.db` package since this is the most updated
annotations_entrez <- AnnotationDbi::select(org.Mm.eg.db,
keys = as.character(target_gene_list),
columns = c("ENTREZID"),
keytype = "SYMBOL")
## 'select()' returned many:1 mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_entrez$ENTREZID) == FALSE)
# Return only the non-duplicated genes using indices
annotations_entrez <- annotations_entrez[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_entrez$ENTREZID) %>%
which() %>%
length()
## [1] 1
# annotate the dataset with Entrez ID
peaks.mir$target_Entrez_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], annotations_entrez$SYMBOL)
if (length(index)>0) {
peaks.mir$target_Entrez_ID[i] <- paste(annotations_entrez$ENTREZID[index], collapse = " ")
}
}
# save the datafile
saveRDS(peaks.mir, "Datafiles/peaks-mirs-200-09282019-withID.rds")
Now we group peaks by miRNA.
targetofmiR <- function(peaks.mir = brain.peaks.mirs,
miRNA = "",
sitetype = "8mer"){
peaks.mir.sub <- as.data.frame(peaks.mir[,c("log2FC", "padj",
"seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")])
peaks.seedmatch <- lapply(c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer"),
function(seed){
map <- peaks.mir.sub[grepl(miRNA, peaks.mir.sub[,seed]),]
map <- rownames(map)
map
})
names(peaks.seedmatch) <- c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")
if (sitetype == "8mer"){
maps <- peaks.seedmatch[[1]]
} else if (sitetype == "7mer_above"){
maps <- unique(unlist(peaks.seedmatch[1:3]))
} else if (sitetype == "7mer"){
maps <- unique(unlist(peaks.seedmatch[2:3]))
} else if (sitetype == "6mer"){
maps <- peaks.seedmatch[[4]]
} else {
print("Please input site type as: 8mer, 7mer_above, 7mer or 6mer")
}
return(peaks.mir[maps])
}
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-09282019.rds")
mirs <- subset(mirna.family.DGE, baseMean > 200)
mirs <- mirs$miR.family
mirs.peaks <- lapply(mirs,
function(mir){
targetofmiR(miRNA = mir,
peaks.mir = peaks.mir,
sitetype = "7mer_above")
})
names(mirs.peaks) <- mirs
saveRDS(mirs.peaks, "Datafiles/miRNA-peaks-list-09282019-withIDs.rds")
Now we can draw the CAD plots for specific miRNAs. I will do the top 10 that have the most peaks associated.
colnames(rna_DGE)[1] <- "gene"
colnames(rna_DGE)[3] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, peaks.mir$target_Ensembl_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
for (i in 1:10) {
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
png('RNA_CDF.png',
height = 800,
width = 1400,
res = 100,
pointsize = 8)
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene,
mirs.peaks[[1]]$target_Ensembl_ID,
mirs.peaks[[2]]$target_Ensembl_ID,
mirs.peaks[[3]]$target_Ensembl_ID,
mirs.peaks[[4]]$target_Ensembl_ID,
mirs.peaks[[5]]$target_Ensembl_ID,
mirs.peaks[[6]]$target_Ensembl_ID,
mirs.peaks[[7]]$target_Ensembl_ID,
mirs.peaks[[8]]$target_Ensembl_ID,
mirs.peaks[[9]]$target_Ensembl_ID,
mirs.peaks[[10]]$target_Ensembl_ID),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
include_graphics('RNA_CDF.png')
pdf('RNA_CDF.pdf',
height = 8,
width = 14)
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene,
mirs.peaks[[1]]$target_Ensembl_ID,
mirs.peaks[[2]]$target_Ensembl_ID,
mirs.peaks[[3]]$target_Ensembl_ID,
mirs.peaks[[4]]$target_Ensembl_ID,
mirs.peaks[[5]]$target_Ensembl_ID,
mirs.peaks[[6]]$target_Ensembl_ID,
mirs.peaks[[7]]$target_Ensembl_ID,
mirs.peaks[[8]]$target_Ensembl_ID,
mirs.peaks[[9]]$target_Ensembl_ID,
mirs.peaks[[10]]$target_Ensembl_ID),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
# define the function
plotBoxplot.ggplot <- function(gene.counts, gene.sets, gene.set.labels) {
require(ggplot2)
df.log2fc <- gene.counts[,c("gene", "log2FC")]
#rownames(df.log2fc) <- df.log2fc$gene
if (length(gene.sets) != length(gene.set.labels)){
return("Length of gene sets doesn't match labels")
}
target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
for (i in 2:length(gene.sets)){
target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
}
gene.set.counts <- c()
for (j in 1:length(gene.sets)){
gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
}
target.expr$Category <- rep(gene.set.labels, gene.set.counts)
target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
p <- ggplot(target.expr, aes(x=Category, y=log2FC, fill = Category)) + geom_boxplot() + xlab("") +
ylab("log2FC(KRas-G12D/WT)") + scale_fill_manual(values = cols[1:length(gene.sets)]) + theme(axis.text.x =element_text(angle = 90, hjust = 1))
print(p)
}
#get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Ensembl_ID))
}
plotBoxplot.ggplot(rna_DGE,
c(list(rna_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-2,2)
Load the proteomic dataset
protein_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/crcMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## `Protein Id` = col_character(),
## Description = col_character(),
## p_values = col_double(),
## q_values = col_double(),
## foldChange = col_double(),
## LFC = col_double()
## )
colnames(protein_DGE)[2] <- "gene"
colnames(protein_DGE)[7] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, peaks.mir$target_Uniprot_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
for (i in 1:10) {
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, mirs.peaks[[i]]$target_Uniprot_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
png('Protein_CDF.png',
height = 800,
width = 1400,
res = 100,
pointsize = 8)
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene,
mirs.peaks[[1]]$target_Uniprot_ID,
mirs.peaks[[2]]$target_Uniprot_ID,
mirs.peaks[[3]]$target_Uniprot_ID,
mirs.peaks[[4]]$target_Uniprot_ID,
mirs.peaks[[5]]$target_Uniprot_ID,
mirs.peaks[[6]]$target_Uniprot_ID,
mirs.peaks[[7]]$target_Uniprot_ID,
mirs.peaks[[8]]$target_Uniprot_ID,
mirs.peaks[[9]]$target_Uniprot_ID,
mirs.peaks[[10]]$target_Uniprot_ID),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
include_graphics('Protein_CDF.png')
pdf('Protein_CDF.pdf',
height = 8,
width = 14)
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene,
mirs.peaks[[1]]$target_Uniprot_ID,
mirs.peaks[[2]]$target_Uniprot_ID,
mirs.peaks[[3]]$target_Uniprot_ID,
mirs.peaks[[4]]$target_Uniprot_ID,
mirs.peaks[[5]]$target_Uniprot_ID,
mirs.peaks[[6]]$target_Uniprot_ID,
mirs.peaks[[7]]$target_Uniprot_ID,
mirs.peaks[[8]]$target_Uniprot_ID,
mirs.peaks[[9]]$target_Uniprot_ID,
mirs.peaks[[10]]$target_Uniprot_ID),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Uniprot_ID))
}
plotBoxplot.ggplot(protein_DGE,
c(list(protein_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-1,1)
## Warning: Removed 118 rows containing non-finite values (stat_boxplot).
This analysis will use the LFC of CLIP peak heights between HVAK and HVA samples.
clip_DGE <- as.data.frame(cbind(peaks.mir$name, peaks.mir$hvak.hva.log2FC))
colnames(clip_DGE)[1] <- "gene"
colnames(clip_DGE)[2] <- "log2FC"
clip_DGE$log2FC <- as.numeric(as.character(clip_DGE$log2FC))
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
for (i in 1:10) {
plotCDF.ggplot(clip_DGE,
list(clip_DGE$gene, mirs.peaks[[i]]$name),
c("All peaks", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1),
title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)",
xlim = c(-1,3)
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
png('CLIP_CDF.png',
height = 800,
width = 1400,
res = 100,
pointsize = 8)
plotCDF.ggplot(clip_DGE,
list(clip_DGE$gene,
mirs.peaks[[1]]$name,
mirs.peaks[[2]]$name,
mirs.peaks[[3]]$name,
mirs.peaks[[4]]$name,
mirs.peaks[[5]]$name,
mirs.peaks[[6]]$name,
mirs.peaks[[7]]$name,
mirs.peaks[[8]]$name,
mirs.peaks[[9]]$name,
mirs.peaks[[10]]$name),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
include_graphics('CLIP_CDF.png')
pdf('CLIP_CDF.pdf',
height = 8,
width = 14)
plotCDF.ggplot(clip_DGE,
list(clip_DGE$gene,
mirs.peaks[[1]]$name,
mirs.peaks[[2]]$name,
mirs.peaks[[3]]$name,
mirs.peaks[[4]]$name,
mirs.peaks[[5]]$name,
mirs.peaks[[6]]$name,
mirs.peaks[[7]]$name,
mirs.peaks[[8]]$name,
mirs.peaks[[9]]$name,
mirs.peaks[[10]]$name),
c("All genes", mirna[1], mirna[2], mirna[3], mirna[4], mirna[5], mirna[6], mirna[7], mirna[8], mirna[9], mirna[10]),
col = c("grey15", cols[1:10]),
linetype = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$name))
}
plotBoxplot.ggplot(clip_DGE,
c(list(clip_DGE$gene), gene.list),
c("All genes", mirna[1:10])
)
# filter for miRNA with mean counts > 200
mirna.family.DGE <- mirna.family.DGE[mirna.family.DGE$baseMean>200,]
mirna.family.DGE$peak.number <- NA
for (i in 1:length(mirna.family.DGE$miR.family)) {
peak.number <- length(mirs.peaks[[i]])
index <- grep(names(mirs.peaks)[i], mirna.family.DGE$miR.family)
mirna.family.DGE$peak.number[index] <- peak.number
}
p1 <- ggplot(mirna.family.DGE, aes(x = log2(baseMean), y = peak.number, label = miR.family, size = peak.number)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("Log2(Ago2-bound miRNA abundance)") +
ylab("Number of HEAP-CLIP peaks mapped") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) +
xlim(5,18)
p1 <- ggplotly(p1)
p1
# calculate the correlation between these two factors
cor.test(log10(mirna.family.DGE$baseMean), mirna.family.DGE$peak.number)
##
## Pearson's product-moment correlation
##
## data: log10(mirna.family.DGE$baseMean) and mirna.family.DGE$peak.number
## t = 6.0936, df = 86, p-value = 3.024e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3838498 0.6803379
## sample estimates:
## cor
## 0.5491439